library(ArchR)
library(tidyverse)
proj <- loadArchRProject("12_Copy/")
proj <- addKathiScoreMatrix(
input = proj,
genes = getGenes(proj),
peaks = getPeakSet(proj),
geneModel = "exp(-abs(x)/5000)",
matrixName = "GeneScoreMatrix",
extendUpstream = c(1000, 100000),
extendDownstream = c(1000, 100000),
#geneUpstream = 5000, #New Param
#geneDownstream = 0, #New Param
useGeneBoundaries = TRUE,
useTSS = TRUE, #New Param
extendTSS = FALSE,
tileSize = 500,
ceiling = 4,
geneScaleFactor = 5, #New Param
scaleTo = 10000,
excludeChr = c("chrY", "chrM"),
blacklist = getBlacklist(input),
threads = getArchRThreads(),
parallelParam = NULL,
subThreading = FALSE,
force = TRUE,
logFile = createLogFile(".addKathiGeneScoreMat"))
proj <- loadArchRProject("12_Copy/")
i = 1
ArrowFiles = getArrowFiles(proj)
genes = getGenes(proj)
peaks = getPeakSet(proj)
geneModel = "exp(-abs(x)/5000)"
matrixName = "GeneScoreMatrix"
extendUpstream = c(1000, 100000)
extendDownstream = c(1000, 100000)
geneUpstream = 5000 #New Param
geneDownstream = 0 #New Param
useGeneBoundaries = TRUE
useTSS = TRUE #New Param
extendTSS = FALSE
tileSize = 500
ceiling = 4
geneScaleFactor = 5 #New Param
scaleTo = 10000
excludeChr = c("chrY","chrM")
blacklist = getBlacklist(proj)
cellNames = NULL
allCells = NULL
force = FALSE
tmpFile = NULL
subThreads = 1
tstart = NULL
logFile = NULL
# addGeneScoreMatrix <- function(
# input = NULL,
# genes = getGenes(input),
# geneModel = "exp(-abs(x)/5000) + exp(-1)",
# matrixName = "GeneScoreMatrix",
# extendUpstream = c(1000, 100000),
# extendDownstream = c(1000, 100000),
# geneUpstream = 5000, #New Param
# geneDownstream = 0, #New Param
# useGeneBoundaries = TRUE,
# useTSS = FALSE, #New Param
# extendTSS = FALSE,
# tileSize = 500,
# ceiling = 4,
# geneScaleFactor = 5, #New Param
# scaleTo = 10000,
# excludeChr = c("chrY", "chrM"),
# blacklist = getBlacklist(input),
# threads = getArchRThreads(),
# parallelParam = NULL,
# subThreading = TRUE,
# force = FALSE,
# logFile = createLogFile("addGeneScoreMatrix")
# ){
.validInput(input = input, name = "input", valid = c("ArchRProj", "character"))
.validInput(input = genes, name = "genes", valid = c("GRanges"))
.validInput(input = geneModel, name = "geneModel", valid = c("character"))
.validInput(input = matrixName, name = "matrixName", valid = c("character"))
.validInput(input = extendUpstream, name = "extendUpstream", valid = c("integer"))
.validInput(input = extendDownstream, name = "extendDownstream", valid = c("integer"))
.validInput(input = tileSize, name = "tileSize", valid = c("integer"))
.validInput(input = ceiling, name = "ceiling", valid = c("integer"))
.validInput(input = useGeneBoundaries, name = "useGeneBoundaries", valid = c("boolean"))
.validInput(input = scaleTo, name = "scaleTo", valid = c("numeric"))
.validInput(input = excludeChr, name = "excludeChr", valid = c("character", "null"))
.validInput(input = blacklist, name = "blacklist", valid = c("GRanges", "null"))
.validInput(input = threads, name = "threads", valid = c("integer"))
.validInput(input = parallelParam, name = "parallelParam", valid = c("parallelparam", "null"))
.validInput(input = force, name = "force", valid = c("boolean"))
.validInput(input = logFile, name = "logFile", valid = c("character"))
proj <- loadArchRProject("12_Copy/")
i = 1
ArrowFiles = getArrowFiles(proj)
genes = getGenes(proj)
peaks = getPeakSet(proj)
geneModel = "exp(-abs(x)/5000)"
matrixName = "GeneScoreMatrix"
extendUpstream = c(1000, 100000)
extendDownstream = c(1000, 100000)
geneUpstream = 5000 #New Param
geneDownstream = 0 #New Param
useGeneBoundaries = TRUE
useTSS = TRUE #New Param
extendTSS = FALSE
tileSize = 500
ceiling = 4
geneScaleFactor = 5 #New Param
scaleTo = 10000
excludeChr = c("chrY","chrM")
blacklist = getBlacklist(proj)
cellNames = NULL
allCells = NULL
force = FALSE
tmpFile = NULL
subThreads = 1
tstart = NULL
logFile = NULL
input = proj
logFile = createLogFile("addGeneScoreMatrix")
matrixName <- ArchR:::.isProtectedArray(matrixName, exclude = "GeneScoreMatrix")
if(inherits(input, "ArchRProject")){
ArrowFiles <- getArrowFiles(input)
allCells <- rownames(getCellColData(input))
outDir <- getOutputDirectory(input)
}else if(inherits(input, "character")){
outDir <- ""
ArrowFiles <- input
allCells <- NULL
}else{
stop("Error Unrecognized Input!")
}
if(!all(file.exists(ArrowFiles))){
stop("Error Input Arrow Files do not all exist!")
}
if(inherits(mcols(genes)$symbol, "list") | inherits(mcols(genes)$symbol, "SimpleList")){
stop("Found a list in genes symbol! This is an incorrect format. Please correct your genes!")
}
#ArchR:::.startLogging(logFile = logFile)
#ArchR:::.logThis(mget(names(formals()),sys.frame(sys.nframe())), "addGeneScoreMatrix Input-Parameters", logFile = logFile)
#Valid GRanges
genes <- ArchR:::.validGRanges(genes)
# #Add args to list
# args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call())
# args$ArrowFiles <- ArrowFiles
# args$allCells <- allCells
# args$X <- seq_along(ArrowFiles)
# args$FUN <- .addGeneScoreMat
# args$registryDir <- file.path(outDir, "GeneScoresRegistry")
# args$logFile <- logFile
#
# if(subThreading){
# h5disableFileLocking()
# }else{
# args$threads <- length(inputFiles)
# }
#
# #Remove Input from args
# args$input <- NULL
#
# #Run With Parallel or lapply
# outList <- .batchlapply(args)
#
# if(subThreading){
# h5enableFileLocking()
# }
#
# .endLogging(logFile = logFile)
#
# if(inherits(input, "ArchRProject")){
#
# return(input)
#
# }else{
#
# return(unlist(outList))
#
# }
#proj <- loadArchRProject("12_Ricards_peaks_ChromVar/")
#saveArchRProject(proj, "12_Copy")
#proj <- loadArchRProject("12_Copy/")
input = proj
ArrowFiles <- getArrowFiles(proj)
geneModel = "exp(-abs(x)/5000)"
matrixName = "GeneScoreMatrix"
genes = getGenes(proj)
useTSS = TRUE
extendUpstream = c(1000, 100000)
extendDownstream = c(1000, 100000)
useTSS = TRUE
tileSize = 500
ceiling = 4
blacklist = getBlacklist(proj)
scaleTo = 10000
excludeChr = c("chrY","chrM")
geneScaleFactor = 5
cellNames = NULL
allCells = NULL
force = FALSE
tmpFile = NULL
subThreads = 1
tstart = NULL
logFile = NULL
useGeneBoundaries = TRUE
.addGeneScoreMat <- function(
i = NULL,
ArrowFiles = NULL,
genes = NULL,
geneModel = "exp(-abs(x)/5000) + exp(-1)",
matrixName = "GeneScoreMatrix",
extendUpstream = c(1000, 100000),
extendDownstream = c(1000, 100000),
geneUpstream = 5000, #New Param
geneDownstream = 0, #New Param
useGeneBoundaries = TRUE,
useTSS = FALSE, #New Param
extendTSS = TRUE,
tileSize = 500,
ceiling = 4,
geneScaleFactor = 5, #New Param
scaleTo = 10000,
excludeChr = c("chrY","chrM"),
blacklist = NULL,
cellNames = NULL,
allCells = NULL,
force = FALSE,
tmpFile = NULL,
subThreads = 1,
tstart = NULL,
logFile = NULL
){
.validInput(input = i, name = "i", valid = c("integer"))
.validInput(input = ArrowFiles, name = "ArrowFiles", valid = c("character"))
.validInput(input = genes, name = "genes", valid = c("GRanges"))
.validInput(input = geneModel, name = "geneModel", valid = c("character"))
.validInput(input = matrixName, name = "matrixName", valid = c("character"))
.validInput(input = extendUpstream, name = "extendUpstream", valid = c("integer"))
.validInput(input = extendDownstream, name = "extendDownstream", valid = c("integer"))
.validInput(input = tileSize, name = "tileSize", valid = c("integer"))
.validInput(input = ceiling, name = "ceiling", valid = c("integer"))
.validInput(input = useGeneBoundaries, name = "useGeneBoundaries", valid = c("boolean"))
.validInput(input = scaleTo, name = "scaleTo", valid = c("numeric"))
.validInput(input = excludeChr, name = "excludeChr", valid = c("character", "null"))
.validInput(input = blacklist, name = "blacklist", valid = c("GRanges", "null"))
.validInput(input = cellNames, name = "cellNames", valid = c("character", "null"))
.validInput(input = allCells, name = "allCells", valid = c("character", "null"))
.validInput(input = force, name = "force", valid = c("boolean"))
.validInput(input = tmpFile, name = "tmpFile", valid = c("character", "null"))
if(inherits(mcols(genes)$symbol, "list") | inherits(mcols(genes)$symbol, "SimpleList")){
stop("Found a list in genes symbol! This is an incorrect format. Please correct your genes!")
}
Compute for only one sample:
i = 1
ArrowFile <- ArrowFiles[i]
sampleName <- ArchR:::.sampleName(ArrowFile)
if(is.null(tmpFile)){
tmpFile <- ArchR:::.tempfile(pattern = paste0("tmp-", ArchR:::.sampleName(ArrowFile)))
}
#Check
o <- h5closeAll()
#o <- ArchR:::.createArrowGroup(ArrowFile = ArrowFile, group = matrixName, force = force, logFile = logFile)
# remove any chromosomes which we do not want to inlcude (X and Y in this case)
geneRegions <- genes[BiocGenerics::which(seqnames(genes) %bcni% excludeChr)]
print(paste0("Before filtering out the X and Y Chromosome we have ", length(genes), " genes, afterwards we have ", length(geneRegions), "."))
## [1] "Before filtering out the X and Y Chromosome we have 24368 genes, afterwards we have 24334."
geneRegions %>% head
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | gene_id symbol
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr1 3214482-3671498 - | 497097 Xkr4
## [2] chr1 4290846-4409241 - | 19888 Rp1
## [3] chr1 4490928-4497354 - | 20671 Sox17
## [4] chr1 4773198-4785726 - | 27395 Mrpl15
## [5] chr1 4807893-4846735 + | 18777 Lypla1
## [6] chr1 4857694-4897909 + | 21399 Tcea1
## -------
## seqinfo: 21 sequences from mm10 genome
# add information on chromsome names to the GRanges object
seqlevels(geneRegions) <- as.character(unique(seqnames(geneRegions)))
geneRegions <- geneRegions[!is.na(mcols(geneRegions)$symbol)]
print(paste0("After removing missing gene names, we are left with ", length(geneRegions), " genes"))
## [1] "After removing missing gene names, we are left with 24333 genes"
#Create Gene Regions Then Remove Strand Column
if(useTSS){
ArchR:::.logMessage(paste0(sampleName, " .addGeneScoreMat useTSS = TRUE"))
distMethod <- "GenePromoter"
geneRegions$geneStart <- start(resize(geneRegions, 1, "start"))
geneRegions$geneEnd <- start(resize(geneRegions, 1, "end"))
# in the ranges column we want to have the gene start coordinates
geneRegions <- resize(geneRegions, 1, "start")
# this will be ignored for now, since we only want to use the TSS as 1bp size
# if(extendTSS){
# geneRegions <- extendGR(gr = geneRegions, upstream = geneUpstream, downstream = geneDownstream)
# }
geneRegions$geneWeight <- geneScaleFactor
# This will be ignored for now, since we do not want to use gene bodies, but only TSS
}else{
ArchR:::.logMessage(paste0(sampleName, " .addGeneScoreMat useTSS = FALSE"))
distMethod <- "GeneBody"
geneRegions$geneStart <- start(resize(geneRegions, 1, "start"))
geneRegions$geneEnd <- start(resize(geneRegions, 1, "end"))
geneRegions <- extendGR(gr = geneRegions, upstream = geneUpstream, downstream = geneDownstream)
m <- 1 / width(geneRegions)
geneRegions$geneWeight <- 1 + m * (geneScaleFactor - 1) / (max(m) - min(m))
}
#ArchR:::.logDiffTime(sprintf("Computing Gene Scores using distance relative to %s! ", distMethod), tstart, logFile = logFile)
#Add Gene Index For ArrowFile
geneRegions <- sort(sortSeqlevels(geneRegions), ignore.strand = TRUE)
#ArchR:::.logThis(geneRegions, paste0(sampleName, " .addGeneScoreMat geneRegions"), logFile = logFile)
# split geneRegions into a list of GRanges object, one for each chromosome
#and add indices for each chromosome
geneRegions <- split(geneRegions, seqnames(geneRegions))
geneRegions <- lapply(geneRegions, function(x){
mcols(x)$idx <- seq_along(x)
return(x)
})
#Blacklist Split
if(!is.null(blacklist)){
if(length(blacklist) > 0){
# create a list of length of number of chromosomes
# each element of the list contains the GRanges for the corresponding chromosome only
blacklist <- split(blacklist, seqnames(blacklist))
}
}
#Get all cell ids before constructing matrix
if(is.null(cellNames)){
cellNames <- ArchR:::.availableCells(ArrowFile)
}
if(!is.null(allCells)){
cellNames <- cellNames[cellNames %in% allCells]
}
tstart <- Sys.time()
The function extendGR(gr, upstream, downstream) from
ArchR is handy whenever you want to extend genomic regions in a GRanges
object.
#########################################################################################################
#First we will write gene scores to a temporary path! rhdf5 delete doesnt actually delete the memory!
#########################################################################################################
totalGS <- ArchR:::.safelapply(seq_along(geneRegions), function(z){ # we loop over the z chromosomes here
# geneRegions is a list with one Granges object for each chromosome
totalGSz <- tryCatch({
ArchR:::.logDiffTime(sprintf("Creating Temp GeneScoreMatrix for %s, Chr (%s of %s)!", sampleName, z, length(geneRegions)),
tstart, verbose = FALSE, logFile = logFile)
#Get Gene Starts
# extract Granges for chromosome z
geneRegionz <- geneRegions[[z]]
# order the entire GRanges object according to index
geneRegionz <- geneRegionz[order(geneRegionz$idx)]
# get the chromosome information for this gene Region
chrz <- paste0(unique(seqnames(geneRegionz)))
#Read in Fragments
# we get a IRanges object that contains the start/end coordinate, width and
# the metadata column containing the sample and barcode for that sample
frag <- ArchR:::.getFragsFromArrow(ArrowFile, chr = chrz, out = "IRanges", cellNames = cellNames)
print("Lets have a look at the fragments:")
frag %>% head
print("Lets have a look at the fragment sizes:")
hist(width(frag), breaks = 200, main = paste0("Fragment size/width on ",chrz) ,
xlab = "Fragment size")
# we truncate the start to the start of a 500bp tile
fragSt <- trunc(start(frag)/tileSize) * tileSize
# the end will be the same coordinate if it lies wihtin the same 500bp tile
# the end will be the start coordinate of the next tile if the fragment ends in the next tile.
fragEd <- trunc(end(frag)/tileSize) * tileSize
# create a Rle vector which contains the number of times each cell appears in the frag IRanges object
fragBC <- rep(S4Vectors::match(mcols(frag)$RG, cellNames), 2)
rm(frag)
gc()
print(paste0("There are length ", length(fragSt), " start coordinates, but only ", length(unique(fragSt)), " unique inserts, meaining a lot of inserts are found in one and the same tiles." ))
print(paste0("There are length ", length(fragEd), " start coordinates, but only ", length(unique(fragEd)), " unique inserts, meaining a lot of inserts are found in one and the same tiles." ))
print(paste0("Out of all inserts, ", length(unique(c(fragSt,fragEd))), " are unique"))
#Unique Inserts, sorted by coordinates
uniqIns <- sort(unique(c(fragSt,fragEd)))
# these new insertion coordinates correspond to start/end of a tile where an insertion was found
#Construct tile by cell mat!
# i, j and x should have the same dimensions, creating the 3 columns, row, col and value x
matGS <- Matrix::sparseMatrix(
i = match(c(fragSt, fragEd), uniqIns),
j = as.vector(fragBC),
# each entry is one, meaning that an insertion was found in this tile
x = rep(1, 2*length(fragSt)),
dims = c(length(uniqIns), length(cellNames))
)
print(ggplot() + geom_bar(aes(x = matGS@x)) +
scale_y_log10() +
labs(title = "Number of insertions per tile before applying ceiling = 4",
x = "number of insertion", y = "log10(count)"))
if(!is.null(ceiling)){
matGS@x[matGS@x > ceiling] <- ceiling # restrict the number of counts per tile to the value stored in celing (4)
}
print(ggplot() + geom_bar(aes(x = matGS@x)) +
scale_y_log10() +
labs(title = "Number of insertions per tile after applying ceiling = 4",
x = "number of insertion", y = "log10(count)"))
#Unique Tiles
# create IRanges object for all tiles which have an insertion
uniqueTiles <- IRanges(start = uniqIns, width = tileSize)
#Clean Memory
rm(uniqIns, fragSt, fragEd, fragBC)
gc()
#Time to Overlap Gene Windows
#### maybe for later
if(useGeneBoundaries){
geneStartz <- start(resize(geneRegionz, 1, "start"))
geneEndz <- start(resize(geneRegionz, 1, "end"))
pminGene <- pmin(geneStartz, geneEndz)
pmaxGene <- pmax(geneStartz, geneEndz)
idxMinus <- BiocGenerics::which(strand(geneRegionz) != "-")
pReverse <- rep(max(extendDownstream), length(pminGene))
pReverse[idxMinus] <- rep(max(extendUpstream), length(idxMinus))
pReverseMin <- rep(min(extendDownstream), length(pminGene))
pReverseMin[idxMinus] <- rep(min(extendUpstream), length(idxMinus))
pForward <- rep(max(extendUpstream), length(pminGene))
pForward[idxMinus] <- rep(max(extendDownstream), length(idxMinus))
pForwardMin <- rep(min(extendUpstream), length(pminGene))
pForwardMin[idxMinus] <- rep(min(extendDownstream), length(idxMinus))
################################################################
#We will test when genes pass by another gene promoter
################################################################
#Start of Range is based on the max observed gene ranged <- direction
s <- pmax(
c(1, pmaxGene[-length(pmaxGene)] + tileSize),
pminGene - pReverse
)
s <- pmin(pminGene - pReverseMin, s)
#End of Range is based on the max observed gene ranged -> direction
e <- pmin(
c(pminGene[-1] - tileSize, pmaxGene[length(pmaxGene)] + pForward[length(pmaxGene)]),
pmaxGene + pForward
)
e <- pmax(pmaxGene + pForwardMin, e)
extendedGeneRegion <- IRanges(start = s, end = e)
idx1 <- which(pminGene - pReverseMin < start(extendedGeneRegion))
if(length(idx1) > 0){
stop("Error in gene boundaries minError")
}
idx2 <- which(pmaxGene + pForwardMin > end(extendedGeneRegion))
if(length(idx2) > 0){
stop("Error in gene boundaries maxError")
}
rm(s, e, pReverse, pReverseMin, pForward, pForwardMin, geneStartz, geneEndz, pminGene, pmaxGene)
###### interesting for me
}else{
# first we extend the gene region -> IRanges object, which contains start
# and end of the 200,000 bp gene regions/windows (+/- 100,000 from TSS)
extendedGeneRegion <- ranges(suppressWarnings(extendGR(geneRegionz, upstream = max(extendUpstream), downstream = max(extendDownstream))))
}
# Here I would like to use peaks which overlap instead of tiles
# each row of tmp contains index of GeneRegion and the index of Tiles with
# which it overlaps
tmp <- suppressWarnings(findOverlaps(extendedGeneRegion, uniqueTiles))
tmp %>% head
print(paste0("In total there are ", subjectLength(tmp), " tiles overlapping with genes found on this chromosome"))
print(paste0("The total number of tiles found on this chromosome is ", length(uniqueTiles)))
print(tmp %>% as.data.frame() %>%
group_by(queryHits) %>% # gene region
summarize(n = n()) %>% # get number of peaks overlapping with a gene region
ggplot() + geom_bar(aes(x = n)) +
labs(title = "number of tiles per gene region of size +/- 100kb from TSS",
x = "number of tiles"))
print(tmp %>% as.data.frame() %>%
group_by(queryHits) %>% # gene region
summarize(n = n()) %>% # get number of peaks overlapping with a gene region
ggplot() + geom_bar(aes(x = n)) +
labs(title = "number of tiles per gene region of size +/- 100kb from TSS",
x = "number of tiles", y = "log10(counts)") +
scale_y_log10())
# by extracting the indices of genes and tiles which overlap and computing
# the distances between them we get for each match the distance between gene and tile
x <- distance(ranges(geneRegionz)[queryHits(tmp)], uniqueTiles[subjectHits(tmp)])
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of absolute distances between genes and tiles within a gene region",
x = "distance"))
#Determine Sign for Distance relative to strand (Directionality determined based on dist from gene start)
# negative distance meaning upstrea, positive distance downstream of gene
# get Minus strand coordinates
isMinus <- BiocGenerics::which(strand(geneRegionz) == "-")
# subtract the gene start coordinate from the tile start coordinate -> relative distances
signDist <- sign(start(uniqueTiles)[subjectHits(tmp)] -
start(resize(geneRegionz,1,"start"))[queryHits(tmp)])
# convert the direction of distance for all distances corresponding to the negative strand
signDist[isMinus] <- signDist[isMinus] * -1
#Correct the orientation for the distance!
# x contains absolute distances, but we want to give direction to distances
x <- x * signDist
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of relative distances between genes and tiles within a gene region",
x = "relative distance to TSS"))
#Evaluate Input Model -> compute distance weights
x <- eval(parse(text=geneModel))
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of distance weights",
x = "distance weight"))
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of distance weights",
x = "distance weight", y = log10(count)))
print("Real distances: ")
x %>% head
#Get Gene Weights Related to Gene Width, in our case simply multiply
# everything by 5, because we do not use the size of the gene
x <- x * mcols(geneRegionz)$geneWeight[queryHits(tmp)]
print("Distances scaled by gene weight, which is constant in our case: ")
x %>% head
#Remove Blacklisted Tiles!
if(!is.null(blacklist)){
if(length(blacklist) > 0){
blacklistz <- blacklist[[chrz]]
if(is.null(blacklistz) | length(blacklistz) > 0){
tilesBlacklist <- 1 * (!overlapsAny(uniqueTiles, ranges(blacklistz)))
if(sum(tilesBlacklist == 0) > 0){
x <- x * tilesBlacklist[subjectHits(tmp)] #Multiply Such That All Blacklisted Tiles weight is now 0!
}
}
}
}
#Creating Sparse Matrix
tmp <- Matrix::sparseMatrix(
i = queryHits(tmp),
j = subjectHits(tmp),
x = x,
dims = c(length(geneRegionz), nrow(matGS))
)
#Calculate Gene Scores
matGS <- tmp %*% matGS
colnames(matGS) <- cellNames
totalGSz <- Matrix::colSums(matGS)
print(paste0("For chromosome ", chrz, " there are ", length(totalGSz), " cells, meaning that there will be as many total gene activity scores."))
print(ggplot() + geom_histogram(aes(x = totalGSz), bins = 500) +
labs(title = "Total Gene activity in each cell", x = "total gene activity"))
#Save tmp file
.safeSaveRDS(matGS, file = paste0(tmpFile, "-", chrz, ".rds"), compress = FALSE)
#Clean Memory
rm(isMinus, signDist, extendedGeneRegion, uniqueTiles)
rm(matGS, tmp)
gc()
totalGSz
}, error = function(e){
errorList <- list(
ArrowFile = ArrowFile,
geneRegions = geneRegions,
blacklist = blacklist,
chr = chrz,
totalGSz = if(exists("totalGSz", inherits = FALSE)) totalGSz else "totalGSz",
matGS = if(exists("matGS", inherits = FALSE)) matGS else "matGS"
)
# ArchR:::.logError(e, fn = "ArchR:::.addGeneScoreMat TmpGS", info = sampleName, errorList = errorList, logFile = logFile)
})
totalGSz
})#, threads = subThreads) %>% Reduce("+", .)
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 14266647 start coordinates, but only 369018 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 14266647 start coordinates, but only 368956 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 371687 are unique"
## [1] "In total there are 371687 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 371687"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 16815896 start coordinates, but only 340970 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 16815896 start coordinates, but only 340942 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 342982 are unique"
## [1] "In total there are 342982 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 342982"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 9013671 start coordinates, but only 220405 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 9013671 start coordinates, but only 220419 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 222711 are unique"
## [1] "In total there are 222711 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 222711"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 9474237 start coordinates, but only 223417 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 9474237 start coordinates, but only 223370 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 225269 are unique"
## [1] "In total there are 225269 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 225269"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 8475666 start coordinates, but only 222157 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 8475666 start coordinates, but only 222222 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 224609 are unique"
## [1] "In total there are 224609 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 224609"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 8848527 start coordinates, but only 194191 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 8848527 start coordinates, but only 194256 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 195574 are unique"
## [1] "In total there are 195574 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 195574"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 7053025 start coordinates, but only 182549 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 7053025 start coordinates, but only 182568 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 183799 are unique"
## [1] "In total there are 183799 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 183799"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 9421264 start coordinates, but only 176320 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 9421264 start coordinates, but only 176308 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 177748 are unique"
## [1] "In total there are 177748 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 177748"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 6814904 start coordinates, but only 169030 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 6814904 start coordinates, but only 169009 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 170035 are unique"
## [1] "In total there are 170035 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 170035"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 6464254 start coordinates, but only 112839 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 6464254 start coordinates, but only 112785 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 113523 are unique"
## [1] "In total there are 113523 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 113523"
## [1] "Lets have a look at the fragments:"
## [1] "Lets have a look at the fragment sizes:"
## [1] "There are length 4555885 start coordinates, but only 272415 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "There are length 4555885 start coordinates, but only 272527 unique inserts, meaining a lot of inserts are found in one and the same tiles."
## [1] "Out of all inserts, 279601 are unique"
## [1] "In total there are 279601 tiles overlapping with genes found on this chromosome"
## [1] "The total number of tiles found on this chromosome is 279601"
#########################################################################################################
#Organize info for ArchR Arrow
#########################################################################################################
featureDF <- Reduce("c",geneRegions) %>%
{data.frame(
row.names=NULL,
seqnames=as.character(seqnames(.)),
start=mcols(.)$geneStart,
end=mcols(.)$geneEnd,
strand=as.integer(strand(.)),
name=mcols(.)$symbol,
idx=mcols(.)$idx,
stringsAsFactors=FALSE)}
.logThis(featureDF, paste0(sampleName, " .addGeneScoreMat FeatureDF"), logFile = logFile)
dfParams <- data.frame(
extendUpstream = extendUpstream,
extendDownstream = extendDownstream,
geneUpstream = extendUpstream,
geneDownstream = extendDownstream,
scaleTo = scaleTo,
tileSize = tileSize,
ceiling = ceiling,
geneModel = geneModel,
stringsAsFactors=FALSE
)
######################################
# Initialize SP Mat Group
######################################
o <- .initializeMat(
ArrowFile = ArrowFile,
Group = matrixName,
Class = "double",
Units = "NormCounts",
cellNames = cellNames,
params = dfParams,
featureDF = featureDF,
force = TRUE
)
#Clean Memory
rm(dfParams, featureDF, genes)
gc()
#Normalize and add to Arrow File!
for(z in seq_along(geneRegions)){
o <- tryCatch({
#Get Chromosome
chrz <- paste0(unique(seqnames(geneRegions[[z]])))
.logDiffTime(sprintf("Adding GeneScoreMatrix to %s for Chr (%s of %s)!", sampleName, z, length(geneRegions)),
tstart, verbose = FALSE, logFile = logFile)
#Re-Create Matrix for that chromosome!
matGS <- readRDS(paste0(tmpFile, "-", chrz, ".rds"))
file.remove(paste0(tmpFile, "-", chrz, ".rds"))
#Normalize
matGS@x <- as.numeric(scaleTo * matGS@x/rep.int(totalGS, Matrix::diff(matGS@p)))
#Round to Reduce Digits After Final Normalization
matGS@x <- round(matGS@x, 3)
matGS <- Matrix::drop0(matGS)
#Write sparseMatrix to Arrow File!
o <- .addMatToArrow(
mat = matGS,
ArrowFile = ArrowFile,
Group = paste0(matrixName, "/", chrz),
binarize = FALSE,
addColSums = TRUE,
addRowSums = TRUE,
addRowVarsLog2 = TRUE #add for integration analyses
)
#Clean Memory
rm(matGS)
if(z %% 3 == 0 | z == length(geneRegions)){
gc()
}
}, error = function(e){
errorList <- list(
ArrowFile = ArrowFile,
geneRegions = geneRegions,
blacklist = blacklist,
chr = chrz,
mat = if(exists("mat", inherits = FALSE)) mat else "mat"
)
.logError(e, fn = ".addGeneScoreMat AddToArrow", info = sampleName, errorList = errorList, logFile = logFile)
})
}
return(ArrowFile)
}
i = 1
ArrowFiles = getArrowFiles(proj)
genes = getGenes(proj)
peaks = getPeakSet(proj)
geneModel = "exp(-abs(x)/5000)"
matrixName = "GeneScoreMatrix"
extendUpstream = c(1000, 100000)
extendDownstream = c(1000, 100000)
geneUpstream = 5000 #New Param
geneDownstream = 0 #New Param
useGeneBoundaries = TRUE
useTSS = TRUE #New Param
extendTSS = FALSE
tileSize = 500
ceiling = 4
geneScaleFactor = 5 #New Param
scaleTo = 10000
excludeChr = c("chrY","chrM")
blacklist = getBlacklist(proj)
cellNames = NULL
allCells = NULL
force = FALSE
tmpFile = NULL
subThreads = 1
tstart = NULL
logFile = NULL
input = proj
logFile = createLogFile("addGeneScoreMatrix")
matrixName <- ArchR:::.isProtectedArray(matrixName, exclude = "GeneScoreMatrix")
if(inherits(input, "ArchRProject")){
ArrowFiles <- getArrowFiles(input)
allCells <- rownames(getCellColData(input))
outDir <- getOutputDirectory(input)
}else if(inherits(input, "character")){
outDir <- ""
ArrowFiles <- input
allCells <- NULL
}else{
stop("Error Unrecognized Input!")
}
if(!all(file.exists(ArrowFiles))){
stop("Error Input Arrow Files do not all exist!")
}
if(inherits(mcols(genes)$symbol, "list") | inherits(mcols(genes)$symbol, "SimpleList")){
stop("Found a list in genes symbol! This is an incorrect format. Please correct your genes!")
}
#ArchR:::.startLogging(logFile = logFile)
#ArchR:::.logThis(mget(names(formals()),sys.frame(sys.nframe())), "addGeneScoreMatrix Input-Parameters", logFile = logFile)
#Valid GRanges
genes <- ArchR:::.validGRanges(genes)
peaks <- ArchR:::.validGRanges(peaks)
i = 1
ArrowFile <- ArrowFiles[i]
sampleName <- ArchR:::.sampleName(ArrowFile)
if(is.null(tmpFile)){
tmpFile <- ArchR:::.tempfile(pattern = paste0("tmp-", ArchR:::.sampleName(ArrowFile)))
}
#Check
o <- h5closeAll()
#o <- ArchR:::.createArrowGroup(ArrowFile = ArrowFile, group = matrixName, force = force, logFile = logFile)
# remove any chromosomes which we do not want to inlcude (X and Y in this case)
geneRegions <- genes[BiocGenerics::which(seqnames(genes) %bcni% excludeChr)]
peakRegions <- peaks[BiocGenerics::which(seqnames(peaks) %bcni% excludeChr)]
print(paste0("Before filtering out the X and Y Chromosome we have ", length(genes), " genes, afterwards we have ", length(geneRegions), "."))
## [1] "Before filtering out the X and Y Chromosome we have 24368 genes, afterwards we have 24334."
geneRegions %>% head
## GRanges object with 6 ranges and 2 metadata columns:
## seqnames ranges strand | gene_id symbol
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr1 3214482-3671498 - | 497097 Xkr4
## [2] chr1 4290846-4409241 - | 19888 Rp1
## [3] chr1 4490928-4497354 - | 20671 Sox17
## [4] chr1 4773198-4785726 - | 27395 Mrpl15
## [5] chr1 4807893-4846735 + | 18777 Lypla1
## [6] chr1 4857694-4897909 + | 21399 Tcea1
## -------
## seqinfo: 21 sequences from mm10 genome
# add information on chromsome names to the GRanges object
seqlevels(geneRegions) <- as.character(unique(seqnames(geneRegions)))
geneRegions <- geneRegions[!is.na(mcols(geneRegions)$symbol)]
print(paste0("After removing missing gene names, we are left with ", length(geneRegions), " genes"))
## [1] "After removing missing gene names, we are left with 24333 genes"
seqlevels(peakRegions) <- as.character(unique(seqnames(peakRegions)))
#Create Gene Regions Then Remove Strand Column
if(useTSS){
ArchR:::.logMessage(paste0(sampleName, " .addGeneScoreMat useTSS = TRUE"))
distMethod <- "GenePromoter"
geneRegions$geneStart <- start(resize(geneRegions, 1, "start"))
geneRegions$geneEnd <- start(resize(geneRegions, 1, "end"))
# in the ranges column we want to have the gene start coordinates
geneRegions <- resize(geneRegions, 1, "start")
peakRegions$peakStart <- start(resize(peakRegions, 1, "start"))
peakRegions$peakEnd <- end(resize(peakRegions, 1, "start"))
# this will be ignored for now, since we only want to use the TSS as 1bp size
# if(extendTSS){
# geneRegions <- extendGR(gr = geneRegions, upstream = geneUpstream, downstream = geneDownstream)
# }
geneRegions$geneWeight <- geneScaleFactor
# This will be ignored for now, since we do not want to use gene bodies, but only TSS
}else{
ArchR:::.logMessage(paste0(sampleName, " .addGeneScoreMat useTSS = FALSE"))
distMethod <- "GeneBody"
geneRegions$geneStart <- start(resize(geneRegions, 1, "start"))
geneRegions$geneEnd <- start(resize(geneRegions, 1, "end"))
geneRegions <- extendGR(gr = geneRegions, upstream = geneUpstream, downstream = geneDownstream)
m <- 1 / width(geneRegions)
geneRegions$geneWeight <- 1 + m * (geneScaleFactor - 1) / (max(m) - min(m))
}
#ArchR:::.logDiffTime(sprintf("Computing Gene Scores using distance relative to %s! ", distMethod), tstart, logFile = logFile)
#Add Gene Index For ArrowFile
geneRegions <- sort(sortSeqlevels(geneRegions), ignore.strand = TRUE)
peakRegions <- sort(sortSeqlevels(peakRegions), ignore.strand = TRUE)
#ArchR:::.logThis(geneRegions, paste0(sampleName, " .addGeneScoreMat geneRegions"), logFile = logFile)
# split geneRegions into a list of GRanges object, one for each chromosome
#and add indices for each chromosome
geneRegions <- split(geneRegions, seqnames(geneRegions))
geneRegions <- lapply(geneRegions, function(x){
mcols(x)$idx <- seq_along(x)
return(x)
})
peakRegions <- split(peakRegions, seqnames(peakRegions))
peakRegions <- lapply(peakRegions, function(x){
mcols(x)$idx <- seq_along(x)
return(x)
})
# get peak names for the peak matrix
peak_df <- ArchR:::.getFeatureDF(ArrowFile, subGroup = "PeakMatrix")
peak_df <- peak_df %>% as.data.frame() %>%
unite("rownames", seqnames, start, sep = ":", remove = FALSE) %>%
unite("rownames", rownames, end, sep = "-", remove = FALSE)
#Blacklist Split
if(!is.null(blacklist)){
if(length(blacklist) > 0){
# create a list of length of number of chromosomes
# each element of the list contains the GRanges for the corresponding chromosome only
blacklist <- split(blacklist, seqnames(blacklist))
}
}
#Get all cell ids before constructing matrix
if(is.null(cellNames)){
cellNames <- ArchR:::.availableCells(ArrowFile)
}
if(!is.null(allCells)){
cellNames <- cellNames[cellNames %in% allCells]
}
tstart <- Sys.time()
#########################################################################################################
#First we will write gene scores to a temporary path! rhdf5 delete doesnt actually delete the memory!
#########################################################################################################
totalGS <- ArchR:::.safelapply(seq_along(geneRegions), function(z){ # we loop over the z chromosomes here
# geneRegions is a list with one Granges object for each chromosome
totalGSz <- tryCatch({
ArchR:::.logDiffTime(sprintf("Creating Temp GeneScoreMatrix for %s, Chr (%s of %s)!", sampleName, z, length(geneRegions)),
tstart, verbose = FALSE, logFile = logFile)
#Get Gene Starts
# extract Granges for chromosome z
geneRegionz <- geneRegions[[z]]
# order the entire GRanges object according to index
geneRegionz <- geneRegionz[order(geneRegionz$idx)]
# get the chromosome information for this gene Region
chrz <- paste0(unique(seqnames(geneRegionz)))
peakRegionz <- peakRegions[[z]]
peakRegionz <- peakRegionz[order(peakRegionz$idx)]
chrp <- paste0(unique(seqnames(peakRegionz)))
stopifnot(chrz == chrp)
# get peak matrix
peak_mat <- ArchR:::.getMatFromArrow(ArrowFile, binarize = FALSE, cellNames = cellNames, useMatrix = "PeakMatrix")
# set rownames for the peak matrix
rownames(peak_mat) <- peak_df$rownames
# get peak matrix for cellnames
subset_peak_mat <- peak_mat[rownames(peak_mat) %in% names(peakRegionz), ]
#stopifnot(max(subset_peak_mat <= 4))
mcols(peakRegionz)$middle <- start(peakRegionz) + width(peakRegionz)/2
uniquePeaks <- sort(unique(peakRegionz$middle))
rm(peak_mat)
gc()
#Clean Memory
#rm(uniqIns, fragSt, fragEd, fragBC)
gc()
#Time to Overlap Gene Windows
#### maybe for later
if(useGeneBoundaries){
geneStartz <- start(resize(geneRegionz, 1, "start"))
geneEndz <- start(resize(geneRegionz, 1, "end"))
pminGene <- pmin(geneStartz, geneEndz)
pmaxGene <- pmax(geneStartz, geneEndz)
idxMinus <- BiocGenerics::which(strand(geneRegionz) != "-")
pReverse <- rep(max(extendDownstream), length(pminGene))
pReverse[idxMinus] <- rep(max(extendUpstream), length(idxMinus))
pReverseMin <- rep(min(extendDownstream), length(pminGene))
pReverseMin[idxMinus] <- rep(min(extendUpstream), length(idxMinus))
pForward <- rep(max(extendUpstream), length(pminGene))
pForward[idxMinus] <- rep(max(extendDownstream), length(idxMinus))
pForwardMin <- rep(min(extendUpstream), length(pminGene))
pForwardMin[idxMinus] <- rep(min(extendDownstream), length(idxMinus))
################################################################
#We will test when genes pass by another gene promoter
################################################################
#Start of Range is based on the max observed gene ranged <- direction
s <- pmax(
c(1, pmaxGene[-length(pmaxGene)] + tileSize),
pminGene - pReverse
)
s <- pmin(pminGene - pReverseMin, s)
#End of Range is based on the max observed gene ranged -> direction
e <- pmin(
c(pminGene[-1] - tileSize, pmaxGene[length(pmaxGene)] + pForward[length(pmaxGene)]),
pmaxGene + pForward
)
e <- pmax(pmaxGene + pForwardMin, e)
extendedGeneRegion <- IRanges(start = s, end = e)
idx1 <- which(pminGene - pReverseMin < start(extendedGeneRegion))
if(length(idx1) > 0){
stop("Error in gene boundaries minError")
}
idx2 <- which(pmaxGene + pForwardMin > end(extendedGeneRegion))
if(length(idx2) > 0){
stop("Error in gene boundaries maxError")
}
rm(s, e, pReverse, pReverseMin, pForward, pForwardMin, geneStartz, geneEndz, pminGene, pmaxGene)
###### interesting for me
}else{
# first we extend the gene region -> IRanges object, which contains start
# and end of the 200,000 bp gene regions/windows (+/- 100,000 from TSS)
extendedGeneRegion <- ranges(suppressWarnings(extendGR(geneRegionz, upstream = max(extendUpstream), downstream = max(extendDownstream))))
}
# Here I would like to use peaks which overlap instead of tiles
# each row of tmp contains index of GeneRegion and the index of Tiles with
# which it overlaps
# convert peakRegionz to IRanges object as well
tmp <- suppressWarnings(findOverlaps(extendedGeneRegion, ranges(peakRegionz)))
print(paste0("In total there are ",
subjectLength(tmp), " peaks overlapping with gene regions found on this chromosome"))
print(paste0("The total number of peaks found on this chromosome is ", length(peakRegionz)))
print(tmp %>% as.data.frame() %>%
group_by(queryHits) %>% # gene region
summarize(n = n()) %>% # get number of peaks overlapping with a gene region
ggplot() + geom_bar(aes(x = n)) +
labs(title = "number of peaks per gene region of size +/- 100kb from TSS", x = "number of peaks"))
# by extracting the indices of genes and tiles which overlap and computing
# the distances between them we get for each match the distance between gene and tile
# we want to compute the distance to the peak middle
peak_middle_region <- peakRegionz
start(peak_middle_region) = start(peak_middle_region) + floor(width(peak_middle_region) / 2)
x <- distance(ranges(geneRegionz)[queryHits(tmp)],
ranges(resize(peakRegionz, width = 1))[subjectHits(tmp)])
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of absolute distances between genes and tiles within a gene region",
x = "distance"))
#Determine Sign for Distance relative to strand (Directionality determined based on dist from gene start)
# negative distance meaning upstrea, positive distance downstream of gene
# get Minus strand coordinates
isMinus <- BiocGenerics::which(strand(geneRegionz) == "-")
# subtract the gene start coordinate from the tile start coordinate -> relative distances
signDist <- sign(start(ranges(peakRegionz))[subjectHits(tmp)] -
start(resize(geneRegionz,1,"start"))[queryHits(tmp)])
# convert the direction of distance for all distances corresponding to the negative strand
signDist[isMinus] <- signDist[isMinus] * -1
#Correct the orientation for the distance!
# x contains absolute distances, but we want to give direction to distances
x <- x * signDist
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of relative distances between genes and tiles within a gene region",
x = "relative distance to TSS"))
#Evaluate Input Model -> compute distance weights
x <- eval(parse(text=geneModel))
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of distance weights",
x = "distance weight"))
print(ggplot() + geom_histogram(aes(x = x), bins = 500) +
labs(title = "Distribution of distance weights",
x = "distance weight", y = "log10(count)") + scale_y_log10())
print("distanc weights ")
x %>% head
#Get Gene Weights Related to Gene Width, in our case simply multiply
# everything by 5, because we do not use the size of the gene
x <- x * mcols(geneRegionz)$geneWeight[queryHits(tmp)]
print("Distance weights scaled by gene weight, which is constant in our case: ")
x %>% head
#Remove Blacklisted Tiles!
if(!is.null(blacklist)){
if(length(blacklist) > 0){
blacklistz <- blacklist[[chrz]]
if(is.null(blacklistz) | length(blacklistz) > 0){
tilesBlacklist <- 1 * (!overlapsAny(ranges(peakRegionz), ranges(blacklistz)))
if(sum(tilesBlacklist == 0) > 0){
x <- x * tilesBlacklist[subjectHits(tmp)] #Multiply Such That All Blacklisted Tiles weight is now 0!
}
}
}
}
#Creating Sparse Matrix
# this is genes x peaks, with distance weights
tmp <- Matrix::sparseMatrix(
i = queryHits(tmp),
j = subjectHits(tmp),
x = x,
dims = c(length(geneRegionz), nrow(subset_peak_mat))
)
# Number of peaks for each gene
print(ggplot() + geom_histogram(aes(x = rowSums(tmp)), bins = 200) +
labs(title = "Sum of distance weights for each gene",
x = "total distance weights per gene")) #+ scale_y_log10()
# Number of genes for each peak
print(ggplot() + geom_histogram(aes(x = colSums(tmp)), bins = 200) +
labs(title = "Sum of distance weights for each peak",
x = "total distance weights per peak"))
#Calculate Gene Scores
matGS <- tmp %*% subset_peak_mat
colnames(matGS) <- cellNames
totalGSz <- Matrix::colSums(matGS)
print(paste0("For chromosome ", chrz, " there are ", length(totalGSz), " cells, meaning that there will be as many total gene activity scores."))
print(ggplot() + geom_histogram(aes(x = totalGSz), bins = 500) +
labs(title = "Total Gene activity in each cell", x = "total gene activity"))
#Save tmp file
.safeSaveRDS(matGS, file = paste0(tmpFile, "-", chrz, ".rds"), compress = FALSE)
#Clean Memory
rm(isMinus, signDist, extendedGeneRegion, uniqueTiles)
rm(matGS, tmp)
gc()
totalGSz
}, error = function(e){
errorList <- list(
ArrowFile = ArrowFile,
geneRegions = geneRegions,
blacklist = blacklist,
chr = chrz,
totalGSz = if(exists("totalGSz", inherits = FALSE)) totalGSz else "totalGSz",
matGS = if(exists("matGS", inherits = FALSE)) matGS else "matGS"
)
# ArchR:::.logError(e, fn = "ArchR:::.addGeneScoreMat TmpGS", info = sampleName, errorList = errorList, logFile = logFile)
})
totalGSz
})#, threads = subThreads) %>% Reduce("+", .)
## [1] "In total there are 12260 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 12260"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 2 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr1 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 14119 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 14119"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr2 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 8959 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 8959"
## Warning: Transformation introduced infinite values in continuous y-axis
## Removed 2 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr3 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 12236 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 12236"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 1 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr4 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 11745 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 11745"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr5 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 9910 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 9910"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr6 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 11130 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 11130"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr7 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 9851 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 9851"
## Warning: Transformation introduced infinite values in continuous y-axis
## Removed 1 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr8 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 10205 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 10205"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr9 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 7661 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 7661"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr12 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 8094 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 8094"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 4 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr13 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 7151 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 7151"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 6 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr14 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 7375 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 7375"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr15 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 5649 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 5649"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 21 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr16 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 7720 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 7720"
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr17 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 5809 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 5809"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 17 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr18 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 5421 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 5421"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 3 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chr19 there are 7702 cells, meaning that there will be as many total gene activity scores."
## [1] "In total there are 3033 peaks overlapping with gene regions found on this chromosome"
## [1] "The total number of peaks found on this chromosome is 3033"
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 61 rows containing missing values (geom_bar).
## [1] "distanc weights "
## [1] "Distance weights scaled by gene weight, which is constant in our case: "
## [1] "For chromosome chrX there are 7702 cells, meaning that there will be as many total gene activity scores."